
@article{radin_801_1982,
	author = {Radin, George},
	title = {The 801 minicomputer},
	journal = {ACM SIGPLAN Notices},
	volume = {17},
	number = {4},
	pages = {39--47},
	month = mar,
	year = {1982},
	issn = {0362-1340},
	doi = {10.1145/960120.801824},
	url = {https://doi.org/10.1145/960120.801824},
	urldate = {2021-07-17},
	abstract = {This paper provides an overview of an experimental system developed at the IBM T. J. Watson Research Center. It consists of a running hardware prototype, a control program and an optimizing compiler. The basic concepts underlying the system are discussed as are the performance characteristics of the prototype. In particular, three principles are examined: system orientation towards the pervasive use of high level language programming and a sophisticated compiler, a primitive instruction set which can be completely hard-wired, storage hierarchy and I/O organization to enable the CPU to execute an instruction at almost every cycle.},
}

@book{weaver_sparc_1994,
	author = {Weaver, David L. and Germond, Tom},
	title = {The {SPARC} architecture manual: version 9},
	shorttitle = {The {SPARC} architecture manual},
	publisher = {PTR Prentice-Hall},
	address = {Englewood Cliffs},
	year = {1994},
	isbn = {978-0-13-099227-7},
	language = {eng},
}

@article{kessler_alpha_1999,
	title = {The {Alpha} 21264 microprocessor},
	volume = {19},
	issn = {0272-1732},
	url = {http://ieeexplore.ieee.org/document/755465/},
	doi = {10.1109/40.755465},
	number = {2},
	urldate = {2021-07-17},
	journal = {IEEE Micro},
	author = {Kessler, R. E.},
	month = apr,
	year = {1999},
	pages = {24--36},
}

@article{gronowski_433-mhz_1996,
	title = {A 433-{MHz} 64-b quad-issue {RISC} microprocessor},
	volume = {31},
	issn = {0018-9200},
	url = {http://ieeexplore.ieee.org/document/542313/},
	doi = {10.1109/JSSC.1996.542313},
	language = {en},
	number = {11},
	urldate = {2021-07-17},
	journal = {IEEE Journal of Solid-State Circuits},
	author = {Gronowski, P. E. and Bowhill, W. J. and Donchin, D. R. and Blake-Campos, R. P. and Carlson, D. A. and Equi, E. R. and Loughlin, B. J. and Mehta, S. and Mueller, R. O. and Olesin, A. and Noorlag, D. J. W. and Preston, R. P.},
	month = nov,
	year = {1996},
	pages = {1687--1696},
}

@book{may_powerpc_1994,
	address = {San Francisco},
	edition = {2},
	title = {The {PowerPC} architecture: a specification for a new family of {RISC} processors},
	isbn = {978-1-55860-316-5},
	shorttitle = {The {PowerPC} architecture},
	publisher = {Morgan Kaufmann Publishers},
	editor = {May, Cathy},
	year = {1994},
	keywords = {PowerPC microprocessors},
}

@article{yeager_mips_1996,
	title = {The {MIPS} {R10000} superscalar microprocessor},
	volume = {16},
	issn = {0272-1732},
	url = {http://ieeexplore.ieee.org/document/491460/},
	doi = {10.1109/40.491460},
	number = {2},
	urldate = {2021-07-17},
	journal = {IEEE Micro},
	author = {Yeager, Kenneth C.},
	month = apr,
	year = {1996},
	pages = {28--41},
	file = {Yeager - 1996 - The Mips R10000 superscalar microprocessor.pdf:files/3908/Yeager - 1996 - The Mips R10000 superscalar microprocessor.pdf:application/pdf},
}

@book{seal_arm_2006,
	editor = {Seal, David},
	title = {{ARM} architecture reference manual},
	publisher = {Addison-Wesley},
	address = {Harlow},
	year = {2006},
	isbn = {978-0-201-73719-6},
	language = {eng},
}

@techreport{thorton_considerations_1963,
	title = {Considerations in {Computer} {Design} - {Leading} up to the {Control} {Data} 6600},
	institution = {Control Data Corporation},
	author = {Thornton, James E.},
	year = {1963},
	file = {CDC.6600.1963.102641207.pdf:files/3831/CDC.6600.1963.102641207.pdf:application/pdf},
}

@techreport{schlansker_michael_epic_2000,
	title = {{EPIC}: {An} {Architecture} for {Instruction}-{Level} {Parallel} {Processors}},
	number = {HPL-1999-111},
	institution = {HP Laboratories Palo Alto},
	author = {Schlansker, Michael and Rau, B. Ramakrishna},
	year = {2000},
	file = {HPL-1999-111.pdf:files/3839/HPL-1999-111.pdf:application/pdf},
}

@misc{arm_amba_1999,
	author = {ARM},
	title = {{AMBA} specifications ({Rev} 2.0)},
	year = {1999},
	url = {https://developer.arm.com/documentation/ihi0011/a/},
}

@techreport{amd_hypertransport_2010,
	author = {AMD},
	title = {{HyperTransport} {I}/{O} link specification revision 3.10},
	institution = {HyperTransport Technology Consortium},
	year = {2010},
	url = {http://www.hypertransport.org},
}

@techreport{pci-sig_pci_2002,
	title = {{PCI} {Local} {Bus} {Specification} {Revision} 2.3},
	url = {https://pcisig.com},
	institution = {PCI-SIG},
	author = {PCI-SIG},
	year = {2002},
}

@techreport{pci-sig_pci_2006,
	title = {{PCI} {Express} 2.0 {Base} {Specification} {Revision} 1.0},
	url = {https://pcisig.com},
	institution = {PCI-SIG},
	author = {PCI-SIG},
	year = {2006},
}

@techreport{jedec_ddr2_2009,
	title = {{DDR2} {SDRAM} {Specification}},
	url = {https://www.jedec.org/standards-documents/docs/jesd-79-2e},
	institution = {JEDEC Solid State Technology Association},
	author = {JEDEC},
	year = {2009},
}

@article{alverson_tera_1990,
	author = {Alverson, Robert and Callahan, David and Cummings, Daniel and Koblenz, Brian and Porterfield, Allan and Smith, Burton},
	title = {The {Tera} computer system},
	journal = {ACM SIGARCH Computer Architecture News},
	volume = {18},
	number = {3b},
	pages = {1--6},
	month = sep,
	year = {1990},
	issn = {0163-5964},
	doi = {10.1145/255129.255132},
	url = {https://dl.acm.org/doi/10.1145/255129.255132},
	urldate = {2021-07-17},
	language = {en},
}

@article{anderson_performance_1990,
	title = {The performance of spin lock alternatives for shared-memory multiprocessors},
	volume = {1},
	issn = {1558-2183},
	doi = {10.1109/71.80120},
	abstract = {The author examines the questions of whether there are efficient algorithms for software spin-waiting given hardware support for atomic instructions, or whether more complex kinds of hardware support are needed for performance. He considers the performance of a number of software spin-waiting algorithms. Arbitration for control of a lock is in many ways similar to arbitration for control of a network connecting a distributed system. He applies several of the static and dynamic arbitration methods originally developed for networks to spin locks. A novel method is proposed for explicitly queueing spinning processors in software by assigning each a unique number when it arrives at the lock. Control of the lock can then be passed to the next processor in line with minimal effect on other processors.},
	number = {1},
	journal = {IEEE Transactions on Parallel and Distributed Systems},
	author = {Anderson, T. E.},
	month = jan,
	year = {1990},
	keywords = {Hardware, Software performance, Data structures, Bandwidth, Control systems, Costs, Marine technology, Read-write memory, Software algorithms, Spinning},
	pages = {6--16},
	file = {IEEE Xplore Abstract Record:files/3858/80120.html:text/html},
}

@article{graunke_synchronization_1990,
	title = {Synchronization algorithms for shared-memory multiprocessors},
	volume = {23},
	issn = {1558-0814},
	doi = {10.1109/2.55501},
	abstract = {A performance evaluation of the Symmetry multiprocessor system revealed that the synchronization mechanism did not perform well for highly contested locks, like those found in certain parallel applications. Several software synchronization mechanisms were developed and evaluated, using a hardware monitor, on the Symmetry multiprocessor system; the mechanisms were to reduce contention for the lock. The mechanisms remain valuable even when changes are made to the hardware synchronization mechanism to improve support for highly contested locks. The Symmetry architecture is described, and a number of lock algorithms and their use of hardware resources are examined. The performance of each lock is observed from the perspective of both the program itself and the total system performance.},
	number = {6},
	journal = {Computer},
	author = {Graunke, G. and Thakkar, S.},
	month = jun,
	year = {1990},
	keywords = {Application software, Hardware, Computer architecture, Multiprocessing systems, Monitoring, Performance evaluation, System performance},
	pages = {60--69},
	file = {IEEE Xplore Abstract Record:files/3860/55501.html:text/html},
}

@article{yew_distributing_1987,
	title = {Distributing {Hot}-{Spot} {Addressing} in {Large}-{Scale} {Multiprocessors}},
	volume = {C-36},
	issn = {1557-9956},
	doi = {10.1109/TC.1987.1676921},
	abstract = {When a large number of processors try to access a common variable, referred to as hot-spot accesses in [6], not only can the resulting memory contention seriously degrade performance, but it can also cause tree saturation in the interconnection network which blocks both hot and regular requests alike. It is shown in [6] that even if only a small percentage of all requests are to a hot-spot, these requests can cause very serious performances problems, and networks that do the necessary combining of requests are suggested to keep the interconnection network and memory contention from becoming a bottleneck.},
	number = {4},
	journal = {IEEE Transactions on Computers},
	author = {Yew, Pen-Chung and Tzeng, Nian-Feng and Lawrie, Duncan H.},
	month = apr,
	year = {1987},
	keywords = {Combining networks, hot-spot memory, memory bandwidth, memory contention, software combining tree, synchronization},
	pages = {388--395},
}

@book{dally_principles_2004,
	address = {San Francisco, CA, USA},
	title = {Principles and {Practices} of {Interconnection} {Networks}},
	isbn = {978-0-08-049780-8},
	abstract = {One of the greatest challenges faced by designers of digital systems is optimizing the communication and interconnection between system components. Interconnection networks offer an attractive and economical solution to this communication crisis and are fast becoming pervasive in digital systems. Current trends suggest that this communication bottleneck will be even more problematic when designing future generations of machines. Consequently, the anatomy of an interconnection network router and science of interconnection network design will only grow in importance in the coming years. This book offers a detailed and comprehensive presentation of the basic principles of interconnection network design, clearly illustrating them with numerous examples, chapter exercises, and case studies. It incorporates hardware-level descriptions of concepts, allowing a designer to see all the steps of the process from abstract design to concrete implementation. Case studies throughout the book draw on extensive author experience in designing interconnection networks over a period of more than twenty years, providing real world examples of what works, and what doesn't. Tightly couples concepts with implementation costs to facilitate a deeper understanding of the tradeoffs in the design of a practical network. A set of examples and exercises in every chapter help the reader to fully understand all the implications of every design decision.},
	publisher = {Morgan Kaufmann Publishers Inc.},
	author = {Dally, William James and Towles, Brian Patrick},
	year = {2004},
	file = {Full Text PDF:files/3865/Dally 和 Towles - 2004 - Principles and Practices of Interconnection Networ.pdf:application/pdf},
}

@book{__2011,
	author = {陈国良},
	title = {并行计算: 结构. 算法. 编程},
	shorttitle = {并行计算},
	publisher = {高等教育出版社},
	year = {2011},
	isbn = {978-7-04-033742-6},
	language = {zh},
	note = {Google-Books-ID: ltwBoQEACAAJ},
	abstract = {本书以并行计算为主题, 讨论并行计算的硬件基础, 即当代并行计算机系统及其结构模型, 并行计算的核心内容并行算法设计与并行数值算法, 以及并行计算的软件支持并行程序和设计原理与方法等.},
}

@book{__2001,
	author = {胡伟武},
	title = {共享存储系统结构},
	publisher = {高等教育出版社},
	year = {2001},
	isbn = {978-7-04-009849-5},
	language = {zh},
	note = {Google-Books-ID: JqZBAAAACAAJ},
	abstract = {本书研究了共享存贮系统结构中的关键问题。主要包括一个共享存贮系统的执行正确性模型,讨论了正确执行的访存次序的条件,提出一个在顺序一致的共享存贮系统中实现乱序执行的方案,并对其进行模拟。建立了一个描述存贮一致型模型的数学模型等。},
}

@article{_1_2003,
	title = {龙芯1号处理器结构设计},
	volume = {26},
	number = {4},
	journal = {计算机学报},
	author = {{胡伟武} and {唐志敏}},
	year = {2003},
	pages = {385--396},
}

@article{hu_microarchitecture_2005,
	title = {Microarchitecture of the {Godson}-2 {Processor}},
	volume = {20},
	number = {2},
	journal = {Journal of Computer Science and Technology},
	author = {Hu, W. W. and Zhang, F. X. and Li, Z. S.},
	year = {2005},
	pages = {243--249},
}

@article{hu_godson-3_2009,
	author = {Hu, Wei Wu and Wang, Jian and Gao, Xiang and Chen, Yun Ji and Li, Guo Jie},
	title = {Godson-3: {A} {Scalable} {Multicore} {RISC} {Processor} with x86 {Emulation}},
	journal = {IEEE Micro},
	volume = {29},
	number = {2},
	pages = {17--29},
	year = {2009},
}

@inproceedings{hu_godson-3b_2011,
	author = {Hu, Wei Wu and Wang, Ru and Chen, Yun Ji and Fan, Bao Xia and Zhong, Shi Qiang and Qi, Zi Chu and Yang, Xu},
	title = {Godson-{3B}: {A} {1GHz} {40W} 8-core {128GFLOPS} processor in 65nm {CMOS}},
	booktitle = {{IEEE} {International} {Solid}-{State} {Circuits} {Conference}, {ISSCC} 2011, {Digest} of {Technical} {Papers}, {San} {Francisco}, {CA}, {USA}, 20-24 {February}, 2011},
	address = {San Francisco, CA},
	year = {2011},
}

@inproceedings{hu_godson-3b1500_2013,
	title = {Godson-{3B1500}: {A} 32nm {1.35GHz} {40W} {172.8GFLOPS} 8-core processor},
	booktitle = {Solid-{State} {Circuits} {Conference} {Digest} of {Technical} {Papers} ({ISSCC}), 2013 {IEEE} {International}},
	author = {Hu, Wei Wu and Zhang, Y. and Yang, Liang and Fan, Bao Xia and Chen, Shuai},
	year = {2013},
}

@article{hu_8-core_2013,
	title = {An 8-{Core} {MIPS}-{Compatible} {Processor} in 32/28 nm {Bulk} {CMOS}},
	volume = {49},
	number = {1},
	journal = {IEEE Journal of Solid-State Circuits},
	author = {Hu, W. and Yang, L. and Fan, B. and Wang, H.},
	year = {2014},
	pages = {41--49},
}

@article{_gs464e_2015,
	title = {龙芯{GS464E}处理器核架构设计},
	volume = {45},
	number = {4},
	journal = {中国科学:信息科学},
	author = {{吴瑞阳} and {汪文祥} and {王焕东} and {胡伟武}},
	year = {2015},
	pages = {480--500},
}

@article{rotem_power-management_2012,
	author = {Rotem, Efraim and Naveh, Alon and Ananthakrishnan, Avinash and Weissmann, Eliezer and Rajwan, Doron},
	title = {Power-{Management} {Architecture} of the {Intel} {Microarchitecture} {Code}-{Named} {Sandy} {Bridge}},
	journal = {IEEE Micro},
	volume = {32},
	number = {2},
	pages = {20--27},
	year = {2012},
	doi = {10.1109/MM.2012.12},
}

@article{mellor-crummey_algorithms_1991,
	title = {Algorithms for scalable synchronization on shared-memory multiprocessors},
	volume = {9},
	url = {https://doi.org/10.1145/103727.103729},
	doi = {10.1145/103727.103729},
	number = {1},
	journal = {ACM Transactions on Computer Systems},
	author = {Mellor-Crummey, John M. and Scott, Michael L.},
	year = {1991},
	publisher = {Association for Computing Machinery (ACM)},
	pages = {21--65},
}

@inproceedings{agarwal_mit_1995,
	title = {The {MIT} {Alewife} machine: architecture and performance},
	shorttitle = {The {MIT} {Alewife} machine},
	doi = {10.1109/ISCA.1995.524544},
	abstract = {Alewife is a multiprocessor architecture that supports up to 512 processing nodes connected over a scalable and cost-effective mesh network at a constant cost per node. The MIT Alewife machine, a prototype implementation of the architecture, demonstrates that a parallel system can be both scalable and programmable. Four mechanisms combine to achieve these goals: software-extended coherent shared memory provides a global, linear address space; integrated message passing allows compiler and operating system designers to provide efficient communication and synchronization; support for fine-grain computation allows many processors to cooperate on small problem sizes; and latency tolerance mechanisms-including block multithreading and prefetching-mask unavoidable delays due to communication. Microbenchmarks, together with over a dozen complete applications running on the 32-node prototype, help to analyze the behavior of the system. Analysis shows that integrating message passing with shared memory enables a cost-efficient solution to the cache coherence problem and provides a rich set of programming primitives. Block multithreading and prefetching improve performance by up to 25\%, individually, and 35\% together. Finally, language constructs that allow programmers to express fine-grain synchronization can improve performance by over a factor of two.},
	booktitle = {Proceedings 22nd {Annual} {International} {Symposium} on {Computer} {Architecture}},
	author = {Agarwal, A. and Bianchini, R. and Chaiken, D. and Johnson, K. L. and Kranz, D. and Kubiatowicz, J. and Lim, Beng-Hong and Mackenzie, K. and Yeung, D.},
	month = jun,
	year = {1995},
	issn = {1063-6897},
	keywords = {Operating systems, Computer architecture, Prototypes, Costs, Delay, Mesh networks, Message passing, Multithreading, Prefetching, Software prototyping},
	pages = {2--13},
	file = {IEEE Xplore Abstract Record:files/3885/524544.html:text/html},
}

@techreport{nvidia_nvidias_2009,
	type = {white paper},
	title = {{NVIDIA}'s {Next} {Generation} {CUDA} {Compute} {Architecture}: {Fermi}},
	institution = {NVIDIA Corporation},
	author = {Nvidia},
	year = {2009},
}

@misc{strohmaier_top500_nodate,
	author = {Strohmaier, Erich and Dongarra, Jack and Simon, Horst and Meuer, Martin and Meuer, Hans},
	title = {{TOP500} list},
	url = {https://www.top500.org/},
}

@techreport{desikan_sim-alpha_2002,
	title = {Sim-alpha: a {Validated}, {Execution}-{Driven} {Alpha} 21264 {Simulator}},
	shorttitle = {Sim-alpha},
	abstract = {This technical report describes installation, use, and design of sim-alpha, an execution driven Alpha 21264 simulator. To increase simulator accuracy, we have incorporated many of the low level features found in the Alpha 21264. When compared to a hardware 21264 implementation, sim-alpha achieves 2\% error across a suite of microbenchmarks designed to stress the various microarchitectural features in the simulator. The error across the 10 SPECINT 2000 benchmarks is 6.6\% and the 12 SPECFP 2000 benchmarks is 21\%, with the net error being 15\% across the 22 of the 26 SPECCPU 2000 benchmarks.},
	number = {TR-02-04},
	institution = {Department of Computer Sciences, The University of Texas at Austin},
	author = {Desikan, Rajagopalan and Burger, Doug and Keckler, Stephen and Austin, Todd},
	month = jan,
	year = {2002},
}

@inproceedings{bienia_parsec_2008,
	author = {Bienia, Christian and Kumar, Sanjeev and Singh, Jaswinder Pal and Li, Kai},
	title = {The {PARSEC} benchmark suite: {Characterization} and architectural implications},
	shorttitle = {The {PARSEC} benchmark suite},
	booktitle = {2008 {International} {Conference} on {Parallel} {Architectures} and {Compilation} {Techniques} ({PACT})},
	pages = {72--81},
	month = oct,
	year = {2008},
	keywords = {Benchmark testing, Program processors, Computers, Algorithm design and analysis, Animation, benchmark suite, Computational modeling, multithreading, performance measurement, shared-memory computers, Synchronization},
	abstract = {This paper presents and characterizes the Princeton Application Repository for Shared-Memory Computers (PARSEC), a benchmark suite for studies of Chip-Multiprocessors (CMPs). Previous available benchmarks for multiprocessors have focused on high-performance computing applications and used a limited number of synchronization methods. PARSEC includes emerging applications in recognition, mining and synthesis (RMS) as well as systems applications which mimic large-scale multithreaded commercial programs. Our characterization shows that the benchmark suite covers a wide spectrum of working sets, locality, data sharing, synchronization and off-chip traffic. The benchmark suite has been made available to the public.},
	file = {IEEE Xplore Abstract Record:files/3891/7849432.html:text/html},
}

@article{bose_challenges_1999,
	title = {Challenges in processor modeling and validation},
	volume = {19},
	number = {3},
	journal = {IEEE Micro},
	author = {Bose, P. and Conte, T. and Austin, T.},
	year = {1999},
	pages = {9--14},
}

@inproceedings{bird_performance_2007,
	title = {Performance characterization of {SPEC} {CPU} benchmarks on {Intel}'s {Core} microarchitecture based processor},
	booktitle = {2007 {SPEC} {Benchmark} {Workshop}},
	language = {en},
	author = {Bird, Sarah and Phansalkar, Aashish and John, L. and Mericas, A. and Indukuru, Rajeev},
	year = {2007},
	file = {Snapshot:files/3896/cc22e8fa2ce6d5144d556dd7f48b4373a7d04292.html:text/html},
}

@article{srinivas_ibm_2011,
	title = {{IBM} {POWER7} performance modeling, verification, and evaluation},
	volume = {55},
	number = {3},
	journal = {IBM Journal of Research and Development},
	author = {Srinivas, Mysore Sathyanarayana and Sinharoy, B. and Eickemeyer, R. and Raghavan, R. and Kunkel, S. and Chen, Thomas and Maron, William A. and Flemming, Diane G. and Blanchard, A. and Seshadri, P. and Kellington, J. W. and Mericas, A. and Petruski, A. E. and Indukuru, Venkat R. and Reyes, S.},
	year = {2011},
}

@article{anderson_continuous_1997,
	title = {Continuous profiling: where have all the cycles gone?},
	volume = {15},
	issn = {0734-2071},
	shorttitle = {Continuous profiling},
	url = {https://doi.org/10.1145/265924.265925},
	doi = {10.1145/265924.265925},
	abstract = {This article describes the Digital Continuous Profiling Infrastructure, a sampling-based profiling system designed to run continuously on production systems. The system supports multiprocessors, works on unmodified executables, and collects profiles for entire systems, including user programs, shared libraries, and the operating system kernel. Samples are collected at a high rate (over 5200 samples/sec. per 333MHz processor), yet with low overhead (1–3\% slowdown for most workloads). Analysis tools supplied with the profiling system use the sample data to produce a precise and accurate accounting, down to the level of pipeline stalls incurred by individual instructions, of where time is being spent. When instructions incur stalls, the tools identify possible reasons, such as cache misses, branch mispredictions, and functional unit contention. The fine-grained instruction-level analysis guides users and automated optimizers to the causes of performance problems and provides important insights for fixing them.},
	number = {4},
	urldate = {2021-07-17},
	journal = {ACM Transactions on Computer Systems},
	author = {Anderson, Jennifer M. and Berc, Lance M. and Dean, Jeffrey and Ghemawat, Sanjay and Henzinger, Monika R. and Leung, Shun-Tak A. and Sites, Richard L. and Vandevoorde, Mark T. and Waldspurger, Carl A. and Weihl, William E.},
	month = nov,
	year = {1997},
	keywords = {profiling, program analysis, performance understanding, performance-monitoring hardware},
	pages = {357--390},
}

@article{moudgill_environment_1999,
	title = {Environment for {PowerPC} microarchitecture exploration},
	volume = {19},
	issn = {1937-4143},
	doi = {10.1109/40.768496},
	abstract = {Designers face many choices when planning a new high-performance, general purpose microprocessor. Options include superscalar organization (the ability to dispatch and execute more than one instruction at a time), out-of-order issue of instructions, speculative execution, branch prediction, and cache hierarchy. However, the interaction of multiple microarchitecture features is often counterintuitive, raising questions concerning potential performance benefits and other effects on various workloads. Complex design trade-offs require accurate and timely performance modeling, which in turn requires flexible, efficient environments for exploring microarchitecture processor performance. Workload-driven simulation models are essential for microprocessor design space exploration. A processor model must ideally: capture in sufficient detail those features that are already well defined; make evolving assumptions and approximations in interpreting the desired execution semantics for those features that are not yet well defined; and be validated against the existing specification. These requirements suggest the need for an evolving but reasonably precise specification, so that validating against such a specification provides confidence in the results. Processor model validation normally relies on behavioral timing specifications based on test cases that exercise the microarchitecture. This approach, commonly used in simulation-based functional validation methods, is also useful for performance validation. In this article, we describe a workload driven simulation environment for PowerPC processor microarchitecture performance exploration. We summarize the environment's properties and give examples of its usage.},
	number = {3},
	journal = {IEEE Micro},
	author = {Moudgill, M. and Wellman, J.-D. and Moreno, J. H.},
	month = may,
	year = {1999},
	keywords = {Operating systems, Process design, Runtime, Decoding, Libraries, Microarchitecture, Analytical models, Measurement, Throughput, Workstations},
	pages = {15--25},
	file = {IEEE Xplore Abstract Record:files/3900/768496.html:text/html},
}

@article{giladi_spec_1995,
	title = {{SPEC} as a performance evaluation measure},
	volume = {28},
	issn = {1558-0814},
	doi = {10.1109/2.402073},
	abstract = {Potential computer system users or buyers usually employ a computer performance evaluation technique only if they believe its results provide valuable information. System Performance Evaluation Cooperative (SPEC) measures are perceived to provide such information and are therefore the ones most commonly used. SPEC measures are designed to evaluate the performance of engineering and scientific workstations, personal vector computers, and even minicomputers and superminicomputers. Along with the Transaction Processing Council (TPC) measures for database I/O performance, they have become de facto industry standards, but do SPEC's evaluation outcomes actually provide added information value? In this article, we examine these measures by considering their structure, advantages and disadvantages. We use two criteria in our examination: are the programs used in the SPEC suite properly blended to reflect a representative mix of different applications, and are they properly synthesized so that the aggregate measures correctly rank computers by performance? We conclude that many programs in the SPEC suites are superfluous; the benchmark size can be reduced by more than 50\%. The way the measure is calculated may cause distortion. Substituting the harmonic mean for the geometric mean used by SPEC roughly preserves the measure, while giving better consistency. SPEC measures reflect the performance of the CPU rather than the entire system. Therefore, they might be inaccurate in ranking an entire system. To remedy these problems, we propose a revised methodology for obtaining SPEC measures.},
	number = {8},
	journal = {Computer},
	author = {Giladi, R. and Ahituv, N.},
	month = aug,
	year = {1995},
	keywords = {Application software, System performance, Workstations, Computer performance, Councils, Design engineering, Distortion measurement, Measurement standards, Microcomputers, Transaction databases},
	pages = {33--42},
	file = {IEEE Xplore Abstract Record:files/3902/402073.html:text/html},
}

@misc{intel_intel_2016,
	author = {Intel},
	title = {Intel® 64 and {IA}-32 {Architectures} {Software} {Developer}’s {Manual}},
	year = {2016},
}

@inproceedings{li_ivy_1988,
	author = {Li, Kai},
	title = {{IVY}: {A} {Shared} {Virtual} {Memory} {System} for {Parallel} {Computing}},
	booktitle = {Proceedings of the {International} {Conference} on {Parallel} {Processing}, {ICPP} '88, {The} {Pennsylvania} {State} {University}, {University} {Park}, {PA}, {USA}, {August} 1988. {Volume} 2: {Software}},
	pages = {94--101},
	publisher = {Pennsylvania State University Press},
	year = {1988},
	file = {li-ivy.pdf:files/3907/li-ivy.pdf:application/pdf},
}

@article{binkert_gem5_2011,
	title = {The gem5 simulator},
	volume = {39},
	url = {https://doi.org/10.1145/2024716.2024718},
	doi = {10.1145/2024716.2024718},
	number = {2},
	journal = {ACM SIGARCH Computer Architecture News},
	author = {Binkert, Nathan and Beckmann, Bradford and Black, Gabriel and Reinhardt, Steven K. and Saidi, Ali and Basu, Arkaprava and Hestness, Joel and Hower, Derek R. and Krishna, Tushar and Sardashti, Somayeh and Sen, Rathijit and Sewell, Korey and Shoaib, Muhammad and Vaish, Nilay and Hill, Mark D. and Wood, David A.},
	year = {2011},
	publisher = {Association for Computing Machinery (ACM)},
	pages = {1--7},
}
